db_1 <- 
  "~/Downloads/tmp2/202001_202212_02_mx_youtube_data.db"

db_2 <- 
  "~/Downloads/tmp2/202301_202312_02_mx_youtube_data.db"

db_3 <- 
  "~/Downloads/tmp2/202401_202405_02_mx_youtube_data.db"

db_4 <-
  "~/Downloads/tmp2/202405_02_mx_youtube_data.db"

library(DBI)
library(RSQLite)

videos_2020_24 <- 
  data.frame()

for (db in c(db_1, db_2, db_3, db_4)) {
  
  print(db)
  
  # Connect to database
  con <- dbConnect(RSQLite::SQLite(), db)
  
  
  # Read video searches
  this_videos <- 
    dbGetQuery(con,
               "SELECT video_id, publishedAt, categoryId, commentCount FROM videos")
  
  videos_2020_24 <- 
    rbind(videos_2020_24, 
          this_videos)
  
  # Close connection
  dbDisconnect(con)
  
}

youtube_categories <- data.frame(
  categoryId = 
    as.character(c(2, 1, 10, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 
                 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44)),
  category_label = c(
    "Autos & Vehicles",
    "Film & Animation",
    "Music",
    "Pets & Animals",
    "Sports",
    "Short Movies",
    "Travel & Events",
    "Gaming",
    "Videoblogging",
    "People & Blogs",
    "Comedy",
    "Entertainment",
    "News & Politics",
    "Howto & Style",
    "Education",
    "Science & Technology",
    "Nonprofits & Activism",
    "Movies",
    "Anime/Animation",
    "Action/Adventure",
    "Classics",
    "Comedy",  # Note: ID 34 is also Comedy (duplicate)
    "Documentary",
    "Drama",
    "Family",
    "Foreign",
    "Horror",
    "Sci-Fi/Fantasy",
    "Thriller",
    "Shorts",
    "Shows",
    "Trailers"
  ),
  stringsAsFactors = FALSE
)

library(dplyr)
library(ggplot2)
library(scales)
videos_with_labels <- videos_2020_24 %>%
  left_join(youtube_categories, by = "categoryId")


# ============================================================================
# 1. FREQUENCY DISTRIBUTION
# ============================================================================

category_freq <- videos_with_labels %>%
  group_by(categoryId, category_label) %>%
  summarise(
    n_videos = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(n_videos)) %>%
  mutate(
    percentage = n_videos / sum(n_videos) * 100,
    cum_percentage = cumsum(percentage)
  )

# Bar chart - Video count by category
p1 <- ggplot(category_freq, aes(x = reorder(category_label, n_videos), y = n_videos)) +
  geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
  geom_text(aes(label = comma(n_videos)), hjust = -0.2, size = 3) +
  coord_flip() +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Video Distribution by Category",
    subtitle = "Total number of videos per category",
    x = "Category",
    y = "Number of Videos"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.y = element_text(size = 10),
    panel.grid.major.y = element_blank()
  )

print(p1)

# ============================================================================
# 2. PERCENTAGE DISTRIBUTION WITH CUMULATIVE LINE
# ============================================================================

p2 <- ggplot(category_freq, aes(x = reorder(category_label, n_videos))) +
  geom_bar(aes(y = percentage), stat = "identity", fill = "steelblue", alpha = 0.8) +
  geom_line(aes(y = cum_percentage, group = 1), colour = "red", linewidth = 1) +
  geom_point(aes(y = cum_percentage), colour = "red", size = 2) +
  geom_text(aes(y = percentage, label = sprintf("%.1f%%", percentage)), 
            hjust = -0.2, size = 3) +
  coord_flip() +
  scale_y_continuous(
    name = "Percentage of Total Videos",
    labels = function(x) paste0(x, "%"),
    expand = expansion(mult = c(0, 0.15)),
    sec.axis = sec_axis(~., name = "Cumulative Percentage", 
                        labels = function(x) paste0(x, "%"))
  ) +
  labs(
    title = "Category Distribution with Cumulative Percentage",
    subtitle = "Red line shows cumulative distribution",
    x = "Category"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.y = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    axis.title.y.right = element_text(colour = "red"),
    axis.text.y.right = element_text(colour = "red")
  )

print(p2)

# ============================================================================
# 3. COMMENT STATISTICS BY CATEGORY
# ============================================================================

comment_stats <- videos_with_labels %>%
  group_by(categoryId, category_label) %>%
  summarise(
    n_videos = n(),
    total_comments = sum(commentCount, na.rm = TRUE),
    mean_comments = mean(commentCount, na.rm = TRUE),
    median_comments = median(commentCount, na.rm = TRUE),
    sd_comments = sd(commentCount, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_comments))

# Box plot - Comment distribution by category
p3 <- ggplot(videos_with_labels %>% filter(!is.na(category_label)), 
             aes(x = reorder(category_label, commentCount, FUN = median), 
                 y = commentCount)) +
  geom_boxplot(fill = "lightblue", alpha = 0.7, outlier.alpha = 0.3) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3, 
               fill = "red", colour = "darkred") +
  coord_flip() +
  scale_y_continuous(labels = comma, trans = "log10") +
  labs(
    title = "Comment Distribution by Category",
    subtitle = "Box plots show median (line) and mean (red diamond)\nLog scale",
    x = "Category",
    y = "Number of Comments (log scale)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.y = element_text(size = 10)
  )

print(p3)

# Bar chart - Mean comments per category
p4 <- ggplot(comment_stats, aes(x = reorder(category_label, mean_comments), 
                                y = mean_comments)) +
  geom_bar(stat = "identity", fill = "coral", alpha = 0.8) +
  geom_errorbar(aes(ymin = mean_comments - sd_comments/sqrt(n_videos), 
                    ymax = mean_comments + sd_comments/sqrt(n_videos)),
                width = 0.2, alpha = 0.6) +
  geom_text(aes(label = sprintf("%.1f", mean_comments)), 
            hjust = -0.2, size = 3) +
  coord_flip() +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Average Comments per Video by Category",
    subtitle = "Error bars show standard error of the mean",
    x = "Category",
    y = "Mean Number of Comments"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.y = element_text(size = 10),
    panel.grid.major.y = element_blank()
  )

print(p4)

# ============================================================================
# 4. TEMPORAL DISTRIBUTION
# ============================================================================

# Videos over time by category
temporal_dist <- videos_with_labels %>%
  mutate(year_month = floor_date(as.Date(publishedAt), "month")) %>%
  group_by(year_month, category_label) %>%
  summarise(n_videos = n(), .groups = "drop")

# Top 5 categories for clarity
top_categories <- category_freq %>% slice_head(n = 5) %>% pull(category_label)

p5 <- temporal_dist %>%
  filter(category_label %in% top_categories) %>%
  ggplot(aes(x = year_month, y = n_videos, colour = category_label)) +
  geom_line(linewidth = 1) +
  geom_point(size = 1.5) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y") +
  scale_y_continuous(labels = comma) +
  scale_colour_brewer(palette = "Set1", name = "Category") +
  labs(
    title = "Video Upload Trends Over Time",
    subtitle = "Top 5 categories by video count",
    x = "Month",
    y = "Number of Videos"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )

print(p5)


# ============================================================================
# 5. SUMMARY TABLE
# ============================================================================

summary_table <- videos_with_labels %>%
  group_by(category_label) %>%
  summarise(
    N_Videos = n(),
    Pct_Total = sprintf("%.1f%%", n() / nrow(videos_with_labels) * 100),
    Total_Comments = sum(commentCount, na.rm = TRUE),
    Mean_Comments = round(mean(commentCount, na.rm = TRUE), 1),
    Median_Comments = median(commentCount, na.rm = TRUE),
    SD_Comments = round(sd(commentCount, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  arrange(desc(N_Videos))

print(summary_table)
write.csv(summary_table, "category_summary_statistics.csv", row.names = FALSE)

library(patchwork)

combined_plot <- (p1 | p4) / (p2 | p3)

combined_plot <- combined_plot +
  plot_annotation(
    title = "YouTube Video Category Analysis",
    theme = theme(plot.title = element_text(size = 16, face = "bold"))
  )

print(combined_plot)

ggsave("category_analysis_combined.png", combined_plot, 
       width = 16, height = 12, dpi = 300)
